{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据集下载后转成格式调整  jsonl to json\n",
    "\n",
    "转换后的命令格式如下：\n",
    "\n",
    "```json\n",
    "{\n",
    "  \"instruction\": \"你是一个法律专家，请根据用户的问题给出专业的回答\",\n",
    "  \"input\": \"诈骗罪量刑标准是什么?\",\n",
    "  \"output\": \"诈骗罪指的是以非法占有为目的，使用欺骗方法，骗取数额较大的公私财物的行为...\"\n",
    "}\n",
    "```\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 与 jsonl 数据集放在同一目录下 可以新建一个 python 文件\n",
    "import json\n",
    "\n",
    "# 定义固定的instruction\n",
    "INSTRUCTION = \"你是一个法律专家，请根据用户的问题给出专业的回答\"\n",
    "\n",
    "def process_jsonl(input_file, output_file):\n",
    "    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:\n",
    "        for line in infile:\n",
    "            # 读取每一行并解析JSON\n",
    "            data = json.loads(line)\n",
    "            \n",
    "            # 创建新的字典，包含instruction, input和output\n",
    "            new_data = {\n",
    "                \"instruction\": INSTRUCTION,\n",
    "                \"input\": data[\"input\"],\n",
    "                \"output\": data[\"output\"]\n",
    "            }\n",
    "            \n",
    "            # 将新的字典写入输出文件\n",
    "            json.dump(new_data, outfile, ensure_ascii=False)\n",
    "            outfile.write('\\n')\n",
    "\n",
    "# 使用示例\n",
    "input_file = \"DISC-Law-SFT-Pair-QA-released.jsonl\"\n",
    "output_file = \"DISC-Law-SFT-Pair-QA-released-new.jsonl\"\n",
    "\n",
    "process_jsonl(input_file, output_file)\n",
    "print(f\"处理完成。输出文件：{output_file}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 包的导入"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import torch\n",
    "from datasets import Dataset\n",
    "from modelscope import snapshot_download, AutoTokenizer\n",
    "from swanlab.integration.transformers import SwanLabCallback\n",
    "from peft import LoraConfig, TaskType, get_peft_model\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    TrainingArguments,\n",
    "    Trainer,\n",
    "    DataCollatorForSeq2Seq,\n",
    ")\n",
    "import swanlab"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据集预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_func(example):\n",
    "    \"\"\"\n",
    "    将数据集进行预处理\n",
    "    \"\"\"\n",
    "    MAX_LENGTH = 384\n",
    "    input_ids, attention_mask, labels = [], [], []\n",
    "    instruction = tokenizer(\n",
    "        f\"<|im_start|>system\\n{example['instruction']}<|im_end|>\\n<|im_start|>user\\n{example['input']}<|im_end|>\\n<|im_start|>assistant\\n\",\n",
    "        add_special_tokens=False,\n",
    "    )\n",
    "    response = tokenizer(f\"{example['output']}\", add_special_tokens=False)\n",
    "    input_ids = (\n",
    "        instruction[\"input_ids\"] + response[\"input_ids\"] + [tokenizer.pad_token_id]\n",
    "    )\n",
    "    attention_mask = instruction[\"attention_mask\"] + response[\"attention_mask\"] + [1]\n",
    "    labels = (\n",
    "        [-100] * len(instruction[\"input_ids\"])\n",
    "        + response[\"input_ids\"]\n",
    "        + [tokenizer.pad_token_id]\n",
    "    )\n",
    "    if len(input_ids) > MAX_LENGTH:  # 做一个截断\n",
    "        input_ids = input_ids[:MAX_LENGTH]\n",
    "        attention_mask = attention_mask[:MAX_LENGTH]\n",
    "        labels = labels[:MAX_LENGTH]\n",
    "        \n",
    "    return {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"labels\": labels}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_jsonl_path = \"DISC-Law-SFT-Pair-QA-released-new.jsonl\"\n",
    "train_df = pd.read_json(train_jsonl_path, lines=True)[5:5000]\n",
    "train_ds = Dataset.from_pandas(train_df)\n",
    "train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)\n",
    "test_df = pd.read_json(train_jsonl_path, lines=True)[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型下载，并加载到 Transformers 模型中\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 在modelscope上下载Qwen模型到本地目录下\n",
    "model_dir = snapshot_download(\"Qwen/Qwen2.5-Coder-7B-Instruct\", cache_dir=\"/root/autodl-tmp\", revision=\"master\")\n",
    "\n",
    "# Transformers加载模型权重\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"/root/autodl-tmp/Qwen/Qwen2___5-Coder-7B-Instruct/\", use_fast=False, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(\"/root/autodl-tmp/Qwen/Qwen2___5-Coder-7B-Instruct/\", device_map=\"auto\", torch_dtype=torch.bfloat16)\n",
    "model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型预测函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(messages, model, tokenizer):\n",
    "    device = \"cuda\"\n",
    "    text = tokenizer.apply_chat_template(\n",
    "        messages, tokenize=False, add_generation_prompt=True\n",
    "    )\n",
    "    model_inputs = tokenizer([text], return_tensors=\"pt\").to(device)\n",
    "\n",
    "    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)\n",
    "    generated_ids = [\n",
    "        output_ids[len(input_ids) :]\n",
    "        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\n",
    "    ]\n",
    "\n",
    "    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
    "\n",
    "    return response\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## peft model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = LoraConfig(\n",
    "    task_type=TaskType.CAUSAL_LM,\n",
    "    target_modules=[\n",
    "        \"q_proj\",\n",
    "        \"k_proj\",\n",
    "        \"v_proj\",\n",
    "        \"o_proj\",\n",
    "        \"gate_proj\",\n",
    "        \"up_proj\",\n",
    "        \"down_proj\",\n",
    "    ],\n",
    "    inference_mode=False,  # 训练模式\n",
    "    r=64,  # Lora 秩\n",
    "    lora_alpha=16,  # Lora alaph，具体作用参见 Lora 原理\n",
    "    lora_dropout=0.1,  # Dropout 比例\n",
    ")\n",
    "\n",
    "peft_model = get_peft_model(model, config)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设定参数并指定 Callback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir=\"./output/Qwen2.5-Coder-7b\",\n",
    "    per_device_train_batch_size=2,\n",
    "    gradient_accumulation_steps=8,\n",
    "    logging_steps=10,\n",
    "    num_train_epochs=1,\n",
    "    save_steps=100,\n",
    "    learning_rate=1e-4,\n",
    "    save_on_each_node=True,\n",
    "    gradient_checkpointing=True,\n",
    "    report_to=\"none\",\n",
    ")\n",
    "\n",
    "class HuanhuanSwanLabCallback(SwanLabCallback):   \n",
    "    def on_train_begin(self, args, state, control, model=None, **kwargs):\n",
    "        if not self._initialized:\n",
    "            self.setup(args, state, model, **kwargs)\n",
    "            \n",
    "        print(\"训练开始\")\n",
    "        print(\"未开始微调，先取3条主观评测：\")\n",
    "        test_text_list = []\n",
    "        for index, row in test_df[:3].iterrows():\n",
    "            instruction = row[\"instruction\"]\n",
    "            input_value = row[\"input\"]\n",
    "\n",
    "            messages = [\n",
    "                {\"role\": \"system\", \"content\": f\"{instruction}\"},\n",
    "                {\"role\": \"user\", \"content\": f\"{input_value}\"},\n",
    "            ]\n",
    "\n",
    "            response = predict(messages, peft_model, tokenizer)\n",
    "            messages.append({\"role\": \"assistant\", \"content\": f\"{response}\"})\n",
    "                \n",
    "            result_text = f\"【Q】{messages[1]['content']}\\n【LLM】{messages[2]['content']}\\n\"\n",
    "            print(result_text)\n",
    "            \n",
    "            test_text_list.append(swanlab.Text(result_text, caption=response))\n",
    "\n",
    "        swanlab.log({\"Prediction\": test_text_list}, step=0)\n",
    "    \n",
    "    def on_epoch_end(self, args, state, control, **kwargs):\n",
    "        # ===================测试阶段======================\n",
    "        test_text_list = []\n",
    "        for index, row in test_df.iterrows():\n",
    "            instruction = row[\"instruction\"]\n",
    "            input_value = row[\"input\"]\n",
    "            ground_truth = row[\"output\"]\n",
    "\n",
    "            messages = [\n",
    "                {\"role\": \"system\", \"content\": f\"{instruction}\"},\n",
    "                {\"role\": \"user\", \"content\": f\"{input_value}\"},\n",
    "            ]\n",
    "\n",
    "            response = predict(messages, peft_model, tokenizer)\n",
    "            messages.append({\"role\": \"assistant\", \"content\": f\"{response}\"})\n",
    "            \n",
    "            if index == 0:\n",
    "                print(\"epoch\", round(state.epoch), \"主观评测：\")\n",
    "                \n",
    "            result_text = f\"【Q】{messages[1]['content']}\\n【LLM】{messages[2]['content']}\\n【GT】 {ground_truth}\"\n",
    "            print(result_text)\n",
    "            \n",
    "            test_text_list.append(swanlab.Text(result_text, caption=response))\n",
    "\n",
    "        swanlab.log({\"Prediction\": test_text_list}, step=round(state.epoch))\n",
    "        \n",
    "        \n",
    "swanlab_callback = HuanhuanSwanLabCallback(\n",
    "    project=\"Qwen2.5-Coder-LoRA-Law\",\n",
    "    experiment_name=\"7b\",\n",
    "    config={\n",
    "        \"model\": \"https://modelscope.cn/models/Qwen/Qwen2.5-Coder-7B-Instruct\",\n",
    "        \"dataset\": \"https://huggingface.co/datasets/ShengbinYue/DISC-Law-SFT\",\n",
    "        \"github\": \"https://github.com/datawhalechina/self-llm\",\n",
    "        \"system_prompt\": \"你是一个法律专家，请根据用户的问题给出专业的回答\",\n",
    "        \"lora_rank\": 64,\n",
    "        \"lora_alpha\": 16,\n",
    "        \"lora_dropout\": 0.1,\n",
    "    },\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 开始训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=peft_model,\n",
    "    args=args,\n",
    "    train_dataset=train_dataset,\n",
    "    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),\n",
    "    callbacks=[swanlab_callback],\n",
    ")\n",
    "\n",
    "trainer.train()\n",
    "\n",
    "swanlab.finish()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
